#
Dr. M. Baron, Statistical Machine Learning class, STAT-427/627
# Bootstrap analysis of efficacy of the Pfizer-BioNTech COVID-19
Vaccine
# The Phase III clinical trial of Pfizer vaccine included n=44000 people,
# 50% took trt, 50% placebo.
# Result: 8 vaccinated and 162 placebo participants contracted COVID-19.
# A conclusion was made that vaccination reduces the risk of COVID by
# the factor of 20.
# Vaccine critics point to a small sample size of 170 for any meaningful
# conclusions. Here, we explore what confidence statements we can deduce
# from these data.
# Construct a Bootstrap CI for p1/p2 = the ratio of probabilities
# p1 = P( infection | placebo group ), p2 = P( infection | vaccine group )
V = c(rep(0,22000),rep(1,22000)); # 0 = placebo, 1 = treatment
X = c(rep(0,21838),rep(1,162),rep(0,21992),rep(1,8)); # 1 = contracted covid
XV = data.frame(X,V);
pratio = function(xv,Z){ # function of data xv and a subsample Z
x = xv[,1]; v = xv[,2];
Pplacebo = sum(x[Z]==1 & v[Z]==0)/sum(v[Z]==0);
Pvaccine = sum(x[Z]==1 & v[Z]==1)/sum(v[Z]==1);
return( Pplacebo/Pvaccine )
}
pratio(XV,)
# The ratio is estimated as 20.25. We need to supply it with the margin of error
# and a confidence level. That is, use Bootstrap to construct a confidence interval.
library(boot)
b = boot( XV, pratio, R=10000 ) # Apply bootstrap
quantile(b$t,c(0.025,0.975)) # This is a 95% confidence interval for p1/p2
# 2.5% 97.5% # Thus, we can claim with 95% confidence that the ratio of
# 11.34329 54.51769 # probabilities is between 11.3 and 54.5. But do we need an interval,
# or should we just find a lower bound?
quantile(b$t,0.05) # Try a one-sided confidence interval.
# 5% # We can now claim, also with 95% confidence, that vaccination
# 12.29509 # reduces the chance of COVID-19 infection by at least a factor of 12.
################################################################################
### Official efficacy of the vaccine ###
### The efficacy of a vaccine is defined as the difference of risks between ###
### the vaccine group and the placebo group, relative to the placebo group. ###
################################################################################
### To study the efficacy of Pfizer, we modify our function:
efficacy = function(xv,Z){ # function of data xv and a subsample Z
x = xv[,1]; v = xv[,2];
Pplacebo = sum(x[Z]==1 & v[Z]==0)/sum(v[Z]==0);
Pvaccine = sum(x[Z]==1 & v[Z]==1)/sum(v[Z]==1);
return( (Pplacebo - Pvaccine)/Pplacebo ) }
pratio(XV,) # This gives the estimated efficacy which you see in all official reports
# 0.9506173 # But we know that it is only based on the sample, so it has a margin of error.
# So, we proceed by building a confidence interval
b = boot( XV, efficacy, 10000 ) # Bootstrap!
hist(b$t) # This is a histogram of bootstrap efficacy values, it's not very symmetric
# I'm just checking normality, to see if we can settle with a PARAMETRIC
# bootstrap confidence interval, which is the mean b$t, plus or minus 1.96
# standard deviations
shapiro.test(b$t) # Shapiro-Wilk test is a rigorous test of normality
Error in shapiro.test(b$t) : sample size must be between 3 and 5000
# The way it's built in R does not handle samples > 5000
shapiro.test(b$t[sample(10000,5000)]) # Ok, we select a random sample of n=5000
# Shapiro-Wilk normality test
#
# data: b$t[sample(10000, 5000)]
# W = 0.98926, p-value < 2.2e-16 # Shapiro-Wilk test rejects normal distribution.
# Hence, we construct a NONPARAMETRIC
quantile( b$t, c(0.025,0.975) ) # bootstrap confidence interval
# 2.5% 97.5%
# 0.9105982 0.9816955 # This is a 95% confidence interval for efficacy
quantile( b$t, c(0.005,0.995) )
0.5% 99.5%
0.8947269 0.9882617 # This is a 99% confidence interval